In [12]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
import nltk
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
import lda

In [13]:
app = pd.read_pickle('app_cleaned.pickle')

In [14]:
app.columns


Out[14]:
Index([u'category', u'current_rating', u'description', u'id',
       u'is_InAppPurcased', u'is_multilingual', u'is_multiplatform', u'name',
       u'new_version_desc', u'num_current_rating', u'num_overall_rating',
       u'overall_rating', u'price', u'publish_date', u'review1',
       u'review1_star', u'review2', u'review2_star', u'review3',
       u'review3_star', u'scrape_date', u'seller', u'size', u'update_date',
       u'url', u'version'],
      dtype='object')

In [ ]:
app['weighted_rating'] = map(lambda a, b, c,d: np.divide(a,b)*c+(1-np.divide(a,b))*d, app['num_current_rating'], 
                                  app['num_overall_rating'], app['current_rating'], app['overall_rating'])

Simply use the metric we created to define the quality of a app. If the weighted rating is no less than 4.0, it can be seen as a good app. If the weighted rating is no more than 2.5, it is a bad app.


In [18]:
good_app = app.loc[app['weighted_rating'] >=4.0]
bad_app = app.loc[app['weighted_rating'] <=2.5]
good_app = good_app.reset_index(drop=True)
bad_app = bad_app.reset_index(drop=True)

In [17]:
category = app['category']
cate_list = []
for i in category.unique():
    cate = i.lower()
    cate_list.append(cate)

Use star value of different reviews to filter comments.


In [19]:
first_good= good_app.loc[good_app['review1_star']>=4].reset_index(drop=True)['review1']
second_good = good_app.loc[good_app['review2_star']>=4].reset_index(drop=True)['review2']
third_good = good_app.loc[good_app['review3_star']>=4].reset_index(drop=True)['review3']
first_bad = bad_app.loc[bad_app['review1_star']<=2.5].reset_index(drop=True)['review1']
second_bad = bad_app.loc[bad_app['review2_star']<=2.5].reset_index(drop=True)['review2']
third_bad = bad_app.loc[bad_app['review3_star']<=2.5].reset_index(drop=True)['review3']

In [20]:
good_rev = first_good.append(second_good)
all_good = good_rev.append(third_good)
bad_rev = first_bad.append(second_bad)
all_bad = bad_rev.append(third_bad)

Cleaning and Preprocessing

Data cleaning is absolutely crucial for generating a useful topic model. The steps below are common to most natural language processing methods:

  • Tokenizing: converting a document to its atomic elements.
  • Stopping: removing meaningless words.
  • Stemming: merging words that are equivalent in meaning.

Here we need to note that POS tag filter is more about the context of the features than frequencies of features. Topic Modelling tries to map out the recurring patterns of terms into topics. However, every term might not be equally important contextually. For example, POS tag IN contain terms such as – “within”, “upon”, “except”. “CD” contains – “one”,”two”, “hundred” etc. “MD” contains “may”, “must” etc. These terms are the supporting words of a language and can be removed by studying their post tags.


In [21]:
stop = set(stopwords.words('english')+[u'one',u'app',u'it',u'dont',u"i",u"'s","''","``",u'use',u'used',u'using',u'love',
                                      u'would',u'great',u'app.',u'like',u'lot']+ cate_list)
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def stem(tokens,stemmer = PorterStemmer().stem):
    return [stemmer(w.lower()) for w in tokens if w not in stop]

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    tokenize = nltk.word_tokenize
    to_token = stem(tokenize(normalized))
    tags = nltk.pos_tag(to_token)
    dt_tags = [t for t in tags if t[1] in ["DT", "MD", "VBP","IN", "JJ","VB"]]
    for tag in dt_tags:
        normalized = " ".join(tok for tok in to_token if tok not in tag[0])
    return normalized

In [22]:
doc_clean_g1 = [clean(doc).split() for doc in first_good]
doc_clean_g2 = [clean(doc).split() for doc in second_good]
doc_clean_g3 = [clean(doc).split() for doc in third_good]
doc_clean_b1 = [clean(doc).split() for doc in first_bad]
doc_clean_b2 = [clean(doc).split() for doc in second_bad]
doc_clean_b3 = [clean(doc).split() for doc in third_bad]

In [23]:
doc_clean_good = [clean(doc).split() for doc in all_good]
doc_clean_bad = [clean(doc).split() for doc in all_bad]

Preparing Document-Term Matrix

  • Convert a corpus into a document-term matrix. LDA model looks for repeating term patterns in the entire DT matrix

In [24]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary_g1 = corpora.Dictionary(doc_clean_g1)
dictionary_g2 = corpora.Dictionary(doc_clean_g2)
dictionary_g3 = corpora.Dictionary(doc_clean_g3)
dictionary_b1 = corpora.Dictionary(doc_clean_b1)
dictionary_b2 = corpora.Dictionary(doc_clean_b2)
dictionary_b3 = corpora.Dictionary(doc_clean_b3)

In [25]:
dictionary_good = corpora.Dictionary(doc_clean_good)
dictionary_bad = corpora.Dictionary(doc_clean_bad)

In [26]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix_g1 = [dictionary_g1.doc2bow(doc) for doc in doc_clean_g1]
doc_term_matrix_g2 = [dictionary_g2.doc2bow(doc) for doc in doc_clean_g2]
doc_term_matrix_g3 = [dictionary_g3.doc2bow(doc) for doc in doc_clean_g3]
doc_term_matrix_b1 = [dictionary_b1.doc2bow(doc) for doc in doc_clean_b1]
doc_term_matrix_b2 = [dictionary_b2.doc2bow(doc) for doc in doc_clean_b2]
doc_term_matrix_b3 = [dictionary_b3.doc2bow(doc) for doc in doc_clean_b3]

In [27]:
doc_term_matrix_good = [dictionary_good.doc2bow(doc) for doc in doc_clean_good]
doc_term_matrix_bad = [dictionary_bad.doc2bow(doc) for doc in doc_clean_bad]

Running LDA Model (Batch Wise LDA)

  • According to the reference, in order to retrieve most important topic terms, a corpus can be divided into batches of fixed sizes. Running LDA multiple times on these batches will provide different results, however, the best topic terms will be the intersection of all batches.

In [28]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

In [29]:
# Running and Trainign LDA model on the document term matrix.
ldamodel_g1 = Lda(doc_term_matrix_g1, num_topics=3, id2word = dictionary_g1, passes=50)
ldamodel_g2 = Lda(doc_term_matrix_g2, num_topics=3, id2word = dictionary_g2, passes=50)
ldamodel_g3 = Lda(doc_term_matrix_g3, num_topics=3, id2word = dictionary_g3, passes=50)
ldamodel_b1 = Lda(doc_term_matrix_b1, num_topics=3, id2word = dictionary_b1, passes=50)
ldamodel_b2 = Lda(doc_term_matrix_b2, num_topics=3, id2word = dictionary_b2, passes=50)
ldamodel_b3 = Lda(doc_term_matrix_b3, num_topics=3, id2word = dictionary_b3, passes=50)

Examining the results


In [30]:
print(ldamodel_g1.print_topics(num_topics=3, num_words=5))
print(ldamodel_g2.print_topics(num_topics=3, num_words=5))
print(ldamodel_g3.print_topics(num_topics=3, num_words=5))


[(0, u'0.009*"get" + 0.007*"im" + 0.006*"help" + 0.006*"realli" + 0.006*"work"'), (1, u'0.014*"time" + 0.012*"game" + 0.009*"get" + 0.008*"play" + 0.006*"fun"'), (2, u'0.009*"easi" + 0.008*"get" + 0.008*"time" + 0.007*"make" + 0.007*"work"')]
[(0, u'0.012*"time" + 0.007*"also" + 0.006*"ive" + 0.005*"work" + 0.005*"go"'), (1, u'0.007*"time" + 0.007*"work" + 0.006*"app" + 0.005*"photo" + 0.004*"thank"'), (2, u'0.014*"get" + 0.009*"game" + 0.009*"time" + 0.008*"realli" + 0.008*"im"')]
[(0, u'0.008*"time" + 0.008*"get" + 0.008*"work" + 0.008*"see" + 0.007*"make"'), (1, u'0.013*"get" + 0.008*"help" + 0.006*"good" + 0.005*"well" + 0.005*"look"'), (2, u'0.012*"time" + 0.010*"ive" + 0.008*"best" + 0.007*"work" + 0.007*"make"')]

Each generated topic is separated by a comma. Within each topic are the five most probable words to appear in that topic. The best topic terms will be the intersection of all three batches. Some things to think about, for the good app, the comments have common features like:

  1. It's free have some good features that satisfy customers' demand.
  2. It has many good information and details, and customers are comfortable at vision, like screen.
  3. The speed is awesome and save some time.
  4. It provids some help when customers using it.

In [31]:
print(ldamodel_b1.print_topics(num_topics=3, num_words=5))
print(ldamodel_b2.print_topics(num_topics=3, num_words=5))
print(ldamodel_b3.print_topics(num_topics=3, num_words=5))


[(0, u'0.007*"time" + 0.007*"peopl" + 0.006*"tri" + 0.005*"websit" + 0.005*"im"'), (1, u'0.012*"get" + 0.012*"time" + 0.008*"work" + 0.007*"want" + 0.007*"even"'), (2, u'0.012*"time" + 0.009*"work" + 0.007*"even" + 0.007*"get" + 0.007*"go"')]
[(0, u'0.015*"time" + 0.014*"work" + 0.010*"get" + 0.008*"tri" + 0.008*"make"'), (1, u'0.010*"get" + 0.010*"time" + 0.007*"work" + 0.006*"tri" + 0.006*"even"'), (2, u'0.009*"get" + 0.008*"time" + 0.007*"work" + 0.007*"tri" + 0.007*"updat"')]
[(0, u'0.015*"time" + 0.008*"tri" + 0.007*"work" + 0.006*"give" + 0.006*"back"'), (1, u'0.011*"time" + 0.009*"work" + 0.009*"get" + 0.007*"need" + 0.007*"even"'), (2, u'0.010*"work" + 0.008*"get" + 0.007*"time" + 0.006*"even" + 0.006*"download"')]

For the bad apps, from the result, we can see most topics include the word "time". We can refer that customers are not satisfied for the using fluency of these apps. And for the updated version of these apps, they doesn't work sometimes, maybe because flashing back. Meanwhile, compared with the last version, these updated apps maybe designed not that good.

Running LDA Model (For the whole documents)


In [32]:
ldamodel_good = Lda(doc_term_matrix_good, num_topics=10, id2word = dictionary_good, passes=20)
ldamodel_bad = Lda(doc_term_matrix_bad, num_topics=10, id2word = dictionary_bad, passes=20)

In [32]:
print(ldamodel_good.print_topics(num_topics=5, num_words=3))
print(ldamodel_bad.print_topics(num_topics=5, num_words=3))


[(5, u'0.040*open + 0.029*see + 0.027*screen'), (2, u'0.013*time + 0.011*im + 0.011*get'), (9, u'0.030*accur + 0.027*also + 0.025*time'), (7, u'0.026*day + 0.020*everi + 0.014*help'), (3, u'0.021*inform + 0.020*locat + 0.020*map')]
[(6, u'0.013*time + 0.011*work + 0.010*even'), (9, u'0.014*work + 0.013*time + 0.009*doesnt'), (0, u'0.013*get + 0.012*work + 0.012*time'), (5, u'0.012*page + 0.011*time + 0.010*get'), (2, u'0.016*time + 0.009*need + 0.007*show')]

In [56]:
import pyLDAvis
import pyLDAvis.gensim


/Users/liwenyu/anaconda/envs/my_env/lib/python2.7/site-packages/simplejson/encoder.py:262: DeprecationWarning: Interpreting naive datetime as local 2017-03-22 18:42:34.119736. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)

In [ ]:
pyLDAvis.enable_notebook()
good_rev = pyLDAvis.gensim.prepare(ldamodel_good, doc_term_matrix_good, dictionary_good)
bad_rev = pyLDAvis.gensim.prepare(ldamodel_bad, doc_term_matrix_bad, dictionary_bad)

In [65]:
pyLDAvis.save_html(good_rev,"good_rev.html")
good_rev


Out[65]:
/Users/liwenyu/anaconda/envs/my_env/lib/python2.7/site-packages/simplejson/encoder.py:262: DeprecationWarning: Interpreting naive datetime as local 2017-03-22 18:56:13.639523. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)

In [63]:
bad_rev
pyLDAvis.save_html(bad_rev,"bad_rev.html")
bad_rev


Out[63]:
/Users/liwenyu/anaconda/envs/my_env/lib/python2.7/site-packages/simplejson/encoder.py:262: DeprecationWarning: Interpreting naive datetime as local 2017-03-22 18:55:25.280533. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)

In [ ]: